package cn.wit.luohui.day12;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

/**
 * Minimal web-crawler demo: downloads one web page and prints every URL
 * found in its text, one per line, to standard output.
 */
public class DemoCrawler {
	/** Regular expression (source form) matching http/https/ftp/ftps URLs. */
	private static final String URL_PATTERN="((ht|f)tps?):\\/\\/[\\w\\-]+(\\.[\\w\\-]+)+([\\w\\-"
			+ "\\.,@?^=%&:\\/~\\+#]*[\\w\\-\\@?^=%&\\/~\\+#])?";

	/**
	 * Compiled form of {@link #URL_PATTERN}. Compiled once here instead of on
	 * every {@link #parseLine(String)} call, which runs once per page line.
	 */
	private static final Pattern URL_REGEX = Pattern.compile(URL_PATTERN);

	/** Smoke test: runs {@link #parseLine(String)} on a single well-formed URL. */
	@Test
	public void test1() {
		// The previous input used backslashes ("http:\baidu.com"), which the
		// pattern can never match because it requires "://".
		parseLine("http://baidu.com");
	}

	/**
	 * Downloads the page at a fixed address and prints every URL found in it.
	 * I/O failures are reported via a stack trace on standard error.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// try-with-resources closes the reader (and the underlying network
		// stream) on both the normal and the exceptional path.
		try (InputStream webStream = new URL("http://www.jikedaohang.com").openStream();
				BufferedReader reader = new BufferedReader(
						new InputStreamReader(webStream, "utf-8"))) {
			String line;
			while ((line = reader.readLine()) != null) {
				parseLine(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Prints every substring of {@code line} that matches the URL pattern,
	 * one match per output line, in order of appearance.
	 *
	 * @param line text to scan; {@code null} is treated as containing no URLs
	 */
	public static void parseLine(String line) {
		if (line == null) {
			return;
		}
		Matcher matcher = URL_REGEX.matcher(line);
		// find() advances to the next match; group() returns the matched text.
		while (matcher.find()) {
			System.out.println(matcher.group());
		}
	}
}
